import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import model_selection, metrics, cluster, preprocessing
from sklearn.ensemble import RandomForestRegressor
import statsmodels.formula.api as sm
import copy as cp
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
import random
def wide_to_long(df, new_var_1, new_var_2, old_var_list):
    """Reshape `df` from wide to long format.

    Each column named in `old_var_list` becomes a group of rows: the column
    name is stored in the new column `new_var_1` and its values in
    `new_var_2`.  All other columns are carried through unchanged.

    Returns the stacked long-format DataFrame with a fresh integer index.
    """
    # Seed the result with the first wide column...
    new_df = df.drop(old_var_list[1:], axis=1).rename(columns={old_var_list[0]: new_var_2})
    new_df[new_var_1] = old_var_list[0]
    # ...then stack each remaining wide column underneath it.
    for value in old_var_list[1:]:
        var_list_temp = cp.copy(old_var_list)
        var_list_temp.remove(value)
        df_temp = df.drop(var_list_temp, axis=1).rename(columns={value: new_var_2})
        df_temp[new_var_1] = value
        # FIX: DataFrame.append was deprecated and removed in pandas 2.x;
        # pd.concat(ignore_index=True) is the supported equivalent.
        new_df = pd.concat([new_df, df_temp], ignore_index=True)
    return new_df
def long_to_wide(df, headers, values):
    """Pivot `df` from long to wide format.

    Creates one new column per distinct value of `headers`, filled with the
    corresponding entries of `values`.  All remaining columns act together
    as the row identifier.
    """
    # Identifier frame: every column except the header/value pair.
    wide = df.drop([headers, values], axis=1).drop_duplicates()
    key_cols = wide.columns.tolist()
    # One left-merge per distinct header value adds that value's column.
    for label in df[[headers]].drop_duplicates()[headers].tolist():
        sub = df.loc[df[headers] == label].rename(columns={values: label}).drop(headers, axis=1)
        wide = wide.merge(sub, on=key_cols, how='left')
    return wide
# Load the prepared modelling dataset; 'Unnamed: 0' is a leftover CSV index.
data = pd.read_csv('data_for_models.csv').drop('Unnamed: 0', axis=1)
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')
# add dummies for Christmas and Thanksgiving
data['Christmas'] = 0
data.loc[data.Holiday == 'Christmas', 'Christmas'] = 1
data['Thanksgiving'] = 0
data.loc[data.Holiday == 'Thanksgiving', 'Thanksgiving'] = 1
# add dummy for Store C
data['Type_C'] = 0
data.loc[data.Type == 'C', 'Type_C'] = 1
# I0_PromotionN: promotion spend with missing values imputed to zero
# (fillna replaces the np.isnan/.loc assignment — same result, one step).
for i in range(1, 6):
    promo = 'Promotion' + str(i)
    I0_promo = 'I0_' + promo
    data[I0_promo] = data[promo].fillna(0)
# Attach a uniform random integer in [1, 101] to every row; used later as a
# noise regressor to sanity-check that models do not fit pure randomness.
# (Comprehension replaces a per-row loop that re-imported `random` on every
# iteration — the module is already imported at the top of the file.)
data['random_int'] = [random.randint(1, 101) for _ in range(len(data))]
max_date = data['Date'].max()
min_date = data['Date'].min()
# missing weekly_sales values
# Pivot to one (Store, Dept) series per column, one row per Date, so gaps in
# each series show up as NaNs in a full rectangular grid.
data_temp = pd.DataFrame(pd.pivot_table(data, values='Weekly_Sales', index=['Store', 'Dept'], columns=['Date'])).T
temp_sum = len(data_temp.columns.tolist()) * len(data_temp)  # total cells in the grid
data_temp = pd.DataFrame(data_temp.isnull().sum())
data_temp = data_temp.loc[data_temp[0] != 0]  # keep only series with at least one gap
missing = data_temp[0].sum()
print('Missing ' + str(missing) + ' of ' + str(temp_sum), '\n', 'Percent missing: ' + str(missing/temp_sum * 100))
# testing pivot tables
# Impute missing Weekly_Sales with the (Store, Dept, Year) mean.
data_temp = pd.DataFrame(pd.pivot_table(data, values='Weekly_Sales', index=['Store', 'Dept', 'Week'], columns=['Year'])).reset_index()
mean_ws = data[['Weekly_Sales', 'Store', 'Dept', 'Year']]\
    .groupby(['Store', 'Dept', 'Year']).mean().reset_index().rename(columns={'Weekly_Sales': 'Mean'})
mean_ws = wide_to_long(data_temp, 'Year', 'Weekly_Sales', [2010, 2011, 2012])\
    .merge(mean_ws, on=['Store', 'Dept', 'Year'], how='left')\
    .rename(columns={'Weekly_Sales': 'Imp_WS_Mean'})
# Where a week has no observed sales, fall back to that year's mean.
mean_ws.loc[np.isnan(mean_ws.Imp_WS_Mean), 'Imp_WS_Mean'] = mean_ws['Mean']
ws = data[['Store', 'Dept', 'Date', 'Weekly_Sales', 'Previous_Year_Sales']]
data2 = mean_ws.drop('Mean', axis=1).merge(
    data.drop(['Weekly_Sales', 'Previous_Year_Sales'], axis=1).drop_duplicates(),
    on=['Store', 'Dept', 'Week', 'Year'], how='left')
# .merge(ws, on=['Store', 'Dept', 'Date'], how='left')
# data2['Imp_WS_0'] = data2['Weekly_Sales']
# data2.loc[np.isnan(data2.Imp_WS_0), 'Imp_WS_0'] = 0
# remove beginnings and ends added on
# BUG FIX: the upper bound previously compared against min_date, which would
# keep only the very first date; it must be max_date.
data2 = data2.loc[(data2.Date >= min_date) & (data2.Date <= max_date)].drop_duplicates()
data2.head()
len(data2)  # .loc[np.isnan(data2.Previous_Year_Sales)]
# these are the number of missing values
data2.isnull().sum()
# Distribution of the store-level macro variables, one boxplot panel each.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15,10))
data_temp = data[['Store', 'Date', 'Temperature']].drop_duplicates()
sns.boxplot(ax=axs[0, 0], x = 'Store', y = 'Temperature', data = data_temp)
data_temp = data[['Store', 'Date', 'Unemployment']].drop_duplicates()
sns.boxplot(ax=axs[0, 1], x = 'Store', y = 'Unemployment', data = data_temp)
data_temp = data[['Store', 'Date', 'Fuel_Price']].drop_duplicates()
sns.boxplot(ax=axs[1, 0], x = 'Store', y = 'Fuel_Price', data = data_temp)
data_temp = data[['Store', 'Date', 'cpi_new']].drop_duplicates()
sns.boxplot(ax=axs[1, 1], x = 'Store', y = 'cpi_new', data = data_temp)
data.info()
# Assign stores to "regions" by their CPI value on the second-earliest date
# (stores that share a CPI value are assumed to share a region).
df = data[['Store', 'cpi_new', 'Date']].drop_duplicates().sort_values(['Date', 'Store'])
filter_date = df.loc[df.Date != df['Date'].min()]['Date'].min()  # second-earliest date
df = df.loc[df.Date == filter_date]
cpi_list = df[['cpi_new']].drop_duplicates()['cpi_new'].tolist()
# Label the distinct CPI values R1, R2, ... in order of appearance.
group_list = ['R' + str(i + 1) for i in range(len(cpi_list))]
df_temp = df[['cpi_new']].drop_duplicates()
df_temp['region'] = group_list
df = df.merge(df_temp, on='cpi_new', how='left')
data = data.merge(df[['Store', 'region']], on='Store', how='left')
# Re-derive the same grouping from the *last* date as a consistency check:
# if CPI groupings are stable over time the two labellings must agree.
df = data[['Store', 'cpi_new', 'Date']].drop_duplicates().sort_values(['Date', 'Store'])
df = df.loc[df.Date == df['Date'].max()]
cpi_list = df[['cpi_new']].drop_duplicates()['cpi_new'].tolist()
group_list = ['R' + str(i + 1) for i in range(len(cpi_list))]
df_temp = df[['cpi_new']].drop_duplicates()
df_temp['region_2'] = group_list
df = df.merge(df_temp, on='cpi_new', how='left')
data = data.merge(df[['Store', 'region_2']], on='Store', how='left')
# Count of stores whose two labels disagree (0 expected).
test_temp = data[['Store', 'region', 'region_2']].drop_duplicates()
len(test_temp.loc[test_temp.region != test_temp.region_2])
data = data.drop('region_2', axis=1)
# Build per-store mean temperature / CPI features and min-max scale them for
# clustering.
df = data[['Store', 'cpi_new', 'Temperature']]
mycols = ['Mean_Temp', 'Mean_CPI']
mean_temp = data[['Store', 'Temperature']].drop_duplicates().groupby('Store').mean().rename(columns={'Temperature': 'Mean_Temp'})
mean_cpi = data[['Store', 'cpi_new']].drop_duplicates().groupby('Store').mean().rename(columns={'cpi_new': 'Mean_CPI'})
df = df.merge(mean_temp, on='Store', how='left')\
    .merge(mean_cpi, on='Store', how='left')
min_max_scaler = preprocessing.MinMaxScaler()
data_mm = pd.DataFrame(min_max_scaler.fit_transform(df[mycols].astype(float)))
data_mm.index = df[mycols].index  # realign so join below matches row-for-row
to_keep = df[mycols].columns.tolist()
# Scaled columns get an '_mm' suffix (comprehension replaces the append loop).
to_keep_mm = [var + '_mm' for var in to_keep]
data_mm.columns = to_keep_mm
df = df.join(data_mm)
cluster_data = df[['Store', 'Mean_Temp_mm', 'Mean_CPI_mm']].drop_duplicates()
# cluster based on mean temperature, unemployment, and fuel price
# Perform PCA (all components kept) on the scaled store features.
pca = PCA(n_components=None)
pcaResults = pca.fit_transform(cluster_data.drop('Store', axis=1))
explained_variance = pca.explained_variance_ratio_  # variance explained per component
# Elbow method: within-cluster sum of squares for k = 1..10.
wcss = []
for i in range(1, 11):
    kmeans = cluster.KMeans(n_clusters=i, init='k-means++', max_iter=100, n_init=10, random_state=0)
    kmeans.fit(pcaResults)
    wcss.append(kmeans.inertia_)
# Plot the WCSS results
plt.plot(range(1, 11), wcss)
plt.title('The elbow method')
plt.xlabel('number of clusters')
plt.ylabel('WCSS')
plt.show()
explained_variance = pca.explained_variance_ratio_  # Extract the variance
print(np.around(explained_variance, 3))
# NOTE(review): the three triple-quoted blocks below are disabled exploratory
# steps (3-component PCA scatter, KMeans labelling, cluster dummies); they
# are bare string literals with no runtime effect.
'''
# Let's try 3 clusters
# Perform PCA
pca = PCA(n_components=3) # Now specify 2 components
results = pca.fit_transform(cluster_data.drop('Store', axis=1))
# Show Plot
plt.scatter(results[:,0], results[:,1])
plt.title('PCA Analysis')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()
'''
'''
km2 = cluster.KMeans(n_clusters=3)
np.random.seed(1234)
k2cls = km2.fit(cluster_data.drop('Store', axis=1))
cluster_data['cluster'] = k2cls.labels_
'''
'''
data = data.merge(cluster_data[['Store', 'cluster']], on='Store', how='left')
data['cluster_1'] = 0
data.loc[data.cluster == 1, 'cluster_1'] = 1
data['cluster_2'] = 0
data.loc[data.cluster == 2, 'cluster_2'] = 1
'''
# Temperature by region, then weekly sales (raw and log) by region and type.
data_temp = data[['Date', 'Temperature', 'region']].drop_duplicates()
sns.boxplot(x='region', y='Temperature', data=data_temp)
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(30, 20))
data_temp = data[['Date', 'Weekly_Sales', 'region', 'Type']].drop_duplicates()
data_temp['ln_ws'] = np.log(data_temp['Weekly_Sales'] + 1)  # +1 guards zero weeks
sns.boxplot(ax=axs[0], x='region', y='Weekly_Sales', hue='Type', data=data_temp)
# BUG FIX: set_xlabel/set_ylabel were being *assigned* tuples
# (axs[0].set_xlabel = ('Region')) instead of called as methods.
axs[0].set_xlabel('Region')
axs[0].set_ylabel('Weekly Sales (US$)')
sns.boxplot(ax=axs[1], x='region', y='ln_ws', hue='Type', data=data_temp)
axs[1].set_xlabel('Region')
axs[1].set_ylabel('Ln Weekly Sales (US$)')
# Promotion spend by region and store type, one panel per promotion.
fig, axs = plt.subplots(nrows=5, ncols=1, figsize=(30, 40))
for i in range(5):
    promo = 'Promotion' + str(i + 1)
    data_temp = data[['Date', promo, 'region', 'Type']].drop_duplicates()
    sns.boxplot(ax=axs[i], x='region', y=promo, hue='Type', data=data_temp)
    axs[i].set_xlabel('Region')
    axs[i].set_ylabel(promo.replace('tion', 'tion '))
# Scatter of weekly sales vs each promotion, one panel per (region, promotion).
fig, axs = plt.subplots(nrows=15, ncols=5, figsize=(30, 50))
for i in range(5):
    promo = 'Promotion' + str(i + 1)
    for j in range(15):
        region = 'R' + str(j + 1)
        data.loc[data.region == region].plot.scatter(ax=axs[j, i], y='Weekly_Sales', x=promo)
# Mean weekly sales over time per region, one panel per store type.
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20))
st = ['A', 'B', 'C']
for i in range(3):
    data.loc[data.Type == st[i]][['Date', 'region', 'Weekly_Sales']]\
        .drop_duplicates().groupby(['region', 'Date']).mean().unstack().T\
        .reset_index().drop('level_0', axis=1).set_index('Date')\
        .plot(ax=axs[i], legend=False)
    axs[i].set_title('Store Type: ' + st[i])
    axs[i].set_ylabel('Mean Weekly Sales (US$)')
plt.savefig('RegionsSalesType.png')
# Mean weekly sales over time per department, one panel per region.
fig, axs = plt.subplots(nrows=15, ncols=1, figsize=(20, 60))
for i in range(15):
    reg = 'R' + str(i + 1)
    data.loc[data.region == reg][['Date', 'Dept', 'Weekly_Sales']]\
        .drop_duplicates().groupby(['Dept', 'Date']).mean().unstack().T\
        .reset_index().drop('level_0', axis=1).set_index('Date')\
        .plot(ax=axs[i], legend=False)
    axs[i].set_title('Region ' + str(i + 1))
    axs[i].set_ylabel('Mean Weekly Sales (US$)')
plt.savefig('RegionsSalesDept.png')
# Mean weekly sales over time, all regions on one axis.
fig, ax = plt.subplots(figsize=(20, 8))
data[['Date', 'region', 'Weekly_Sales']]\
    .drop_duplicates().groupby(['region', 'Date']).mean().unstack().T\
    .reset_index().drop('level_0', axis=1).set_index('Date')\
    .plot(ax=ax, legend=False)
ax.set_ylabel('Mean Weekly Sales (US$)')
plt.savefig('RegionsSales.png')
# Current vs previous-year sales, all store types overlaid on one axis.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7.5, 7.5))
store = ['A', 'B', 'C']
colours = ['blue', 'orange', 'green']
for i in range(len(store)):
    data.loc[data.Type == store[i]].plot.scatter(ax=ax, x='Previous_Year_Sales', y='Weekly_Sales', s=0.8, c=colours[i])
plt.savefig('prevSalesType.png')
# Same scatter, one panel per store type.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(30, 10))
store = ['A', 'B', 'C']
colours = ['blue', 'orange', 'green']
for i in range(len(store)):
    data.loc[data.Type == store[i]].plot.scatter(ax=axs[i], x='Previous_Year_Sales', y='Weekly_Sales', s=0.8, c=colours[i])
    axs[i].set_title('Store Type: ' + store[i])
    axs[i].set_xlabel('Weekly Sales Previous Year (US$)', fontsize=12)
    axs[i].set_ylabel('Weekly Sales Current Year (US$)', fontsize=12)
plt.savefig('PrevSalesType.png')
# Regression fit of current on previous-year sales, one panel per store type.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(30, 10))
store = ['A', 'B', 'C']
colours = ['blue', 'orange', 'green']
for i in range(len(store)):
    df = data.loc[data.Type == store[i]]
    # BUG FIX: x/y were swapped relative to the axis labels below (and to the
    # scatter panels above) — previous-year sales belongs on the x axis.
    sns.regplot(ax=axs[i],
                x=df['Previous_Year_Sales'], y=df['Weekly_Sales'],
                line_kws={'color': 'red'},
                scatter_kws={'s': 5},
                )
    axs[i].set_title('Store Type: ' + store[i])
    axs[i].set_xlabel('Weekly Sales Previous Year (US$)', fontsize=12)
    axs[i].set_ylabel('Weekly Sales Current Year (US$)', fontsize=12)
plt.savefig('PrevSalesTypeLINE.png')
# Mean current vs previous-year sales over time, one panel per store type.
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 15))
store = ['A', 'B', 'C']
colours = ['blue', 'orange']
variables = ['Weekly_Sales', 'Previous_Year_Sales']
for i in range(len(store)):
    for j in range(2):
        data.loc[data.Type == store[i]][['Date', variables[j]]].groupby('Date').mean().unstack().T\
            .plot.line(ax=axs[i], y=variables[j], x='Date', c=colours[j])
    axs[i].set_title('Store Type: ' + store[i])
# Weekly sales vs each promotion, panels by (promotion, store type).
fig, axs = plt.subplots(nrows=5, ncols=3, figsize=(30, 30))
store = ['A', 'B', 'C']
colours = ['blue', 'orange', 'green']
for i in range(len(store)):
    for j in range(5):
        promo = 'Promotion' + str(j + 1)
        data.loc[data.Type == store[i]].plot.scatter(ax=axs[j, i],
                                                     x=promo,
                                                     y='Weekly_Sales',
                                                     s=0.8,
                                                     c=colours[i])
'''fig, ax = plt.subplots(figsize=(30,10))
sns.boxplot(ax=ax, x = 'Type', y = 'Weekly_Sales', hue = 'cluster', data = data)'''
# Promotion spend by store and holiday, one figure per store type.
# (Consolidates three copy-pasted loops for types A, B and C; the figures
# are produced in the same order as before.)
for store_type in ['A', 'B', 'C']:
    fig, axs = plt.subplots(nrows=5, ncols=1, figsize=(30, 30))
    for i in range(5):
        promo = 'Promotion' + str(i + 1)
        sns.boxplot(ax=axs[i], x='Store', y=promo, hue='Holiday', data=data.loc[data.Type == store_type])
        axs[i].set_title(promo)
# Correlation of weekly sales with the four promotion encodings (raw,
# zero-imputed I0_, imputed, log Ln_) — all store types.
# FIX: the first .corr() previously ended with a line-continuation backslash
# followed by a comment line, which is a syntax error.
data_temp = data[['Weekly_Sales',
                  'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5',
                  'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
                  'Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5']]\
    .corr()
# .sort_values('Weekly_Sales', ascending = False)
axsns = plt.subplots(figsize=(20, 10))
sns.heatmap(data_temp, annot=True, cmap='Blues', linewidths=.5)
# Same correlations, store types A and B only.
data_temp = data.loc[data.Type != 'C'][['Weekly_Sales',
                  'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5',
                  'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
                  'Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5']]\
    .corr()\
    .sort_values('Weekly_Sales', ascending = False)
axsns = plt.subplots(figsize=(20, 10))
sns.heatmap(data_temp, annot=True, cmap='Blues', linewidths=.5)
# Same correlations, store type C only.
data_temp = data.loc[data.Type == 'C'][['Weekly_Sales',
                  'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5',
                  'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
                  'Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5']]\
    .corr()\
    .sort_values('Weekly_Sales', ascending = False)
axsns = plt.subplots(figsize=(20, 10))
sns.heatmap(data_temp, annot=True, cmap='Blues', linewidths=.5)
# All types again, now including the holiday dummies.
data_temp = data[['Weekly_Sales',
                  'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5',
                  'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
                  'Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5',
                  'Christmas', 'Thanksgiving']]\
    .corr()\
    .sort_values('Weekly_Sales', ascending = False)
axsns = plt.subplots(figsize=(20, 10))
sns.heatmap(data_temp, annot=True, cmap='Blues', linewidths=.5)
# other variables
data_temp = data[['Weekly_Sales', 'Previous_Year_Sales',
                  # 'cpi_new',
                  'Fuel_Price', 'Temperature', 'Size', 'Type_C',
                  'Christmas', 'Thanksgiving', 'IsHoliday', 'Imputed_Holiday', 'Unemployment']]
# Negated copy of the target — presumably to anchor the diverging colormap;
# TODO confirm intent.
data_temp['colour_correction'] = data_temp['Weekly_Sales'] * -1
axsns = plt.subplots(figsize=(20, 10))
sns.heatmap(data_temp.corr(), annot=True, cmap='RdBu', linewidths=.5)
data.head()
# plot department means (by weeks) by store type
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20))
st = ['A', 'B', 'C']
for i in range(3):
    data.loc[data.Type == st[i]][['Week', 'Dept', 'Weekly_Sales']].groupby(['Dept', 'Week']).mean().unstack()\
        .T.reset_index().drop('level_0', axis=1).set_index('Week')\
        .plot(ax=axs[i], legend=False)
    axs[i].set_title(st[i])
# Per-department max/min (and range) of weekly sales by store type.
data_summary = data[['Type', 'Dept', 'Weekly_Sales']].groupby(['Dept', 'Type']).describe().unstack().T.reset_index()
# FIX: DataFrame.append was removed in pandas 2.x — use pd.concat instead.
data_summary = pd.concat([data_summary.loc[data_summary.level_1 == 'max'],
                          data_summary.loc[data_summary.level_1 == 'min']])\
    .drop('level_0', axis=1).set_index(['Type', 'level_1']).T
st = ['A', 'B', 'C']
for i in range(3):
    data_summary[st[i], 'range'] = data_summary[st[i], 'max'] - data_summary[st[i], 'min']
# Flag departments with a Christmas sales spike: for weeks > 45 compute
# week-over-week percentage changes of mean sales, then mark departments
# whose range of changes (type-B stores) exceeds 100%.
data_summary = data.loc[data.Week > 45][['Week', 'Type', 'Dept', 'Weekly_Sales']].groupby(['Dept', 'Type', 'Week']).mean().unstack().reset_index()
data_summary.loc[data_summary[('Type', '')] == 'A'].drop(('Type', ''), axis=1).set_index(('Dept', '')).T\
    .plot(figsize=(30, 20))
data_temp = data_summary.loc[data_summary[('Type', '')] == 'A'].drop(('Type', ''), axis=1).set_index(('Dept', ''))
data_temp = data_summary.set_index([('Dept', ''), ('Type', '')]).fillna(0)
for i in range(47, 53):
    data_temp[('pct_change', i)] = (data_temp[('Weekly_Sales', int(i))] - data_temp[('Weekly_Sales', int(i - 1))]) / data_temp[('Weekly_Sales', int(i))] * 100
temp_summary = data_temp['pct_change', ].T.describe().T[['max', 'min']]
temp_summary['range'] = temp_summary['max'] - temp_summary['min']
temp2 = temp_summary[['range']].loc[temp_summary.range != np.inf].unstack()
temp2['xmas_peak'] = 0
temp2.loc[temp2[('range', 'B')] > 100, 'xmas_peak'] = 1
# temp2.hist(bins=20, figsize=(20,10))
data = data.merge(temp2.reset_index()[['xmas_peak', 'Dept']].T.reset_index().drop(('Type', ''), axis=1).set_index('level_0').T,
                  on='Dept', how='left')
data[['Date', 'Type', 'xmas_peak', 'Weekly_Sales']].groupby(['Type', 'xmas_peak', 'Date']).mean().unstack().T.plot(figsize=(20, 10))
# Model groups: A/B without peak (ab_np), A/B with peak (ab_p), and type C.
data['model_group'] = 'ab_np'
data.loc[(data.Type_C == 0) & (data.xmas_peak == 1), 'model_group'] = 'ab_p'
data.loc[(data.Type_C == 1), 'model_group'] = 'c'
data['Cluster'] = '1'
data.loc[data.model_group == 'ab_np', 'Cluster'] = '2'
data.loc[data.model_group == 'ab_p', 'Cluster'] = '3'
data[['Date', 'Cluster', 'Weekly_Sales']].groupby(['Cluster', 'Date']).sum().unstack().T.plot(figsize=(20, 10))
plt.savefig('modelgroupSalesDate.png')
# NOTE(review): the three triple-quoted blocks below are disabled pairplot
# cells (sales vs promotions, coloured by model group or store type); they
# are bare string literals with no runtime effect.
'''
sns.set(style="ticks")
sns.pairplot(data[['Weekly_Sales', 'Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3',
'Ln_Promotion4', 'Ln_Promotion5', 'model_group']].dropna(), hue='model_group')
'''
'''
sns.set(style="ticks")
sns.pairplot(data[['Weekly_Sales', 'Promotion1', 'Promotion2', 'Promotion3',
'Promotion4', 'Promotion5', 'model_group']].dropna(), hue='model_group')
'''
'''
sns.set(style="ticks")
sns.pairplot(data[['Weekly_Sales', 'Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3',
'Ln_Promotion4', 'Ln_Promotion5', 'Type']].dropna(), hue='Type')
'''
# dept 16 does not fit!!
data.loc[
    (data.model_group == 'ab_np')  # & (data.Dept != 16)
][['Weekly_Sales', 'Date', 'Dept']].groupby(['Dept', 'Date']).mean().unstack().T\
    .plot(figsize=(20, 15))
# dept 3 does not fit!!
data.loc[
    (data.model_group == 'ab_p')  # & (data.Dept != 3)
][['Weekly_Sales', 'Date', 'Dept']].groupby(['Dept', 'Date']).mean().unstack().T\
    .plot(figsize=(20, 15))
# only 3 and 16
data.loc[
    (data.Dept == 16) | (data.Dept == 3)
][['Weekly_Sales', 'Date', 'Dept']].groupby(['Dept', 'Date']).mean().unstack().T\
    .plot(figsize=(10, 7))
# Current vs previous-year sales, coloured by model group.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7.5, 7.5))
group = ['ab_np', 'ab_p', 'c']
colours = ['blue', 'orange', 'green']
# FIX: the loop previously ranged over len(store) — a leftover list from an
# earlier cell that only worked because it happened to have length 3.
for i in range(len(group)):
    data.loc[data.model_group == group[i]].plot.scatter(ax=ax, x='Previous_Year_Sales', y='Weekly_Sales', s=0.8, c=colours[i])
data[['Weekly_Sales', 'model_group']].groupby('model_group').describe()
# How many departments land in each A/B group.
test = data.loc[data.Type != 'C'][['Dept', 'model_group']].drop_duplicates()
ab_np_list = test.loc[test.model_group == 'ab_np'].sort_values('Dept')['Dept'].tolist()
ab_p_list = test.loc[test.model_group == 'ab_p'].sort_values('Dept')['Dept'].tolist()
print('no peak group: ' + str(len(ab_np_list)), '\n', 'peak group: ' + str(len(ab_p_list)))
data.info()
# Long ("tidy") layout: one row per (observation, variable) pair, for a
# faceted scatter of sales against each variable by model group and type.
tidy = data[['Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5', 'cpi_new']]\
.stack()\
.reset_index().set_index('level_0')\
.rename(columns={0: 'value','level_1': 'var'})\
.join(data[['model_group', 'Type', 'Weekly_Sales']])
g = sns.FacetGrid(tidy, row='model_group', col='var', hue='Type')
g = g.map(plt.scatter, 'value', 'Weekly_Sales', s=0.5)
# One-hot dummies for the two A/B model groups ('c' is the implicit baseline).
data['ab_np'] = 0
data.loc[data.model_group == 'ab_np', 'ab_np'] = 1
data['ab_p'] = 0
data.loc[data.model_group == 'ab_p', 'ab_p'] = 1
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Assumes `y_true` contains no zeros (division would produce inf/nan).
    """
    relative_errors = np.abs((y_true - y_pred) / y_true)
    return np.mean(relative_errors) * 100
# Model Functions
def linear_model(data, y, x_list):
    """Fit an OLS regression of `y` on the columns in `x_list`.

    Builds a patsy formula 'y ~ x1+x2+...' and fits it with statsmodels.
    Returns the fitted results object.
    """
    # '+'.join replaces the manual string-concatenation loop.
    function = y + '~' + '+'.join(x_list)
    m1 = sm.ols(function, data).fit()
    return m1
def lr_and_print(data, y, x_list):
    """Fit OLS of `y` on `x_list`, print the in-sample RMSE, return the model.

    Rows with missing values in any used column are dropped before fitting.
    """
    # BUG FIX: the body previously referenced the module-level `x` instead of
    # the `x_list` parameter, silently ignoring the argument passed in.
    to_keep = cp.copy(x_list)
    to_keep.append(y)
    model_data = data[to_keep].dropna()
    m2 = linear_model(model_data, y, x_list)
    rmse = np.sqrt(metrics.mean_squared_error(model_data[y], m2.fittedvalues))
    print('RSME: ' + str(rmse))
    return m2
# run baseline model with only previous sales
y = 'Weekly_Sales'
x = ['Previous_Year_Sales']
m1=lr_and_print(data, y, x)
# run baseline model with only previous sales
# ...plus the random-integer noise column as a sanity check.
y = 'Weekly_Sales'
x = ['Previous_Year_Sales', 'random_int']
m1b=lr_and_print(data, y, x)
m1b.summary()
# Density of observed sales vs the baseline model's fitted values.
f,ax=plt.subplots(1, figsize=(9, 6))
sns.kdeplot(data['Weekly_Sales'], shade=True, ax=ax, label='$y$')
sns.kdeplot(m1.fittedvalues, shade=True, ax=ax, label='$\hat{y}_1$')
#sns.kdeplot(m1b.fittedvalues, shade=True, ax=ax, label='$\hat{y}_2$')
plt.show()
# Weekly Sales by Store Type
# sensitivity test on promotion data --> for Types A and B
# Four promotion encodings are compared: raw, zero-imputed (I0_), imputed,
# and log-transformed (Ln_); `random_int` is included as a noise control.
# models
y = 'Weekly_Sales'
x = ['Promotion3', 'Promotion5', 'random_int']
m2a = lr_and_print(data.loc[data.Type != 'C'], y, x)
x = ['I0_Promotion3', 'I0_Promotion5', 'random_int']
m2b = lr_and_print(data.loc[data.Type != 'C'], y, x)
x = ['Imputed_Promotion3', 'Imputed_Promotion5', 'random_int']
m2c = lr_and_print(data.loc[data.Type != 'C'], y, x)
x = ['Ln_Promotion3', 'Ln_Promotion5', 'random_int']
m2d = lr_and_print(data.loc[data.Type != 'C'], y, x)
#plots
# Observed vs fitted density for each encoding, side by side.
data_temp = data.loc[data.Type != 'C']
f,ax=plt.subplots(nrows=1, ncols=4, figsize=(30, 10))
sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[0], label='$y$')
sns.kdeplot(m2a.fittedvalues, shade=True, ax=ax[0], label='$\hat{y}_1$')
sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[1], label='$y$')
sns.kdeplot(m2b.fittedvalues, shade=True, ax=ax[1], label='$\hat{y}_2$')
sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[2], label='$y$')
sns.kdeplot(m2c.fittedvalues, shade=True, ax=ax[2], label='$\hat{y}_2$')
sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[3], label='$y$')
sns.kdeplot(m2d.fittedvalues, shade=True, ax=ax[3], label='$\hat{y}_2$')
plt.show()
m2d.summary()
# sensitivity test on promotion data --> for Type C
# models
y = 'Weekly_Sales'
x = ['Promotion2', 'Promotion4', 'random_int']
m3a = lr_and_print(data.loc[data.Type == 'C'], y, x)
x = ['I0_Promotion2', 'I0_Promotion4', 'random_int']
m3b = lr_and_print(data.loc[data.Type == 'C'], y, x)
x = ['Imputed_Promotion2', 'Imputed_Promotion4', 'random_int']
m3c = lr_and_print(data.loc[data.Type == 'C'], y, x)
x = ['Ln_Promotion2', 'Ln_Promotion4', 'random_int']
m3d = lr_and_print(data.loc[data.Type == 'C'], y, x)
# plots
data_temp = data.loc[data.Type == 'C']
f,ax=plt.subplots(nrows=1, ncols=4, figsize=(30, 10))
sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[0], label='$y$')
sns.kdeplot(m3a.fittedvalues, shade=True, ax=ax[0], label='$\hat{y}_1$')
sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[1], label='$y$')
sns.kdeplot(m3b.fittedvalues, shade=True, ax=ax[1], label='$\hat{y}_2$')
sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[2], label='$y$')
sns.kdeplot(m3c.fittedvalues, shade=True, ax=ax[2], label='$\hat{y}_2$')
sns.kdeplot(data_temp['Weekly_Sales'], shade=True, ax=ax[3], label='$y$')
sns.kdeplot(m3d.fittedvalues, shade=True, ax=ax[3], label='$\hat{y}_2$')
plt.show()
m3d.summary()
# impact of model group as a dummy
# NOTE(review): `m4a` is re-bound by each of the three fits below; only the
# last one (Type + random_int) survives past this section.
y = 'Weekly_Sales'
x = ['model_group', 'Previous_Year_Sales']
m4a = lr_and_print(data, y, x)
m4a.summary()
# impact of model group as a dummy
y = 'Weekly_Sales'
x = ['model_group', 'random_int']
m4a = lr_and_print(data, y, x)
m4a.summary()
# impact of model group as a dummy
y = 'Weekly_Sales'
x = ['Type', 'random_int']
m4a = lr_and_print(data, y, x)
m4a.summary()
# split into test and train sets (last 2 months will be test)
data_temp = data.loc[data.Weekly_Sales != 0]
test_data_og = data_temp.loc[data_temp.Date >= (data['Date'].max()-datetime.timedelta(days=60))]
train_data_og = data_temp.loc[data_temp.Date < (data['Date'].max()-datetime.timedelta(days=60))]
# Baseline linear regression: previous-year sales only.
model_variables = ['Weekly_Sales', 'Previous_Year_Sales']
test_data = test_data_og[model_variables].dropna()
# NOTE(review): zero-sales rows were already excluded above, so this +1
# adjustment looks like dead code — confirm before removing.
test_data.loc[test_data.Weekly_Sales == 0, 'Weekly_Sales'] += 1
train_data = train_data_og[model_variables].dropna()
train_data.loc[train_data.Weekly_Sales == 0, 'Weekly_Sales'] += 1
lr = LinearRegression()
lr.fit(train_data.drop('Weekly_Sales', axis=1), train_data['Weekly_Sales'])
lr_preds = lr.predict(test_data.drop('Weekly_Sales', axis=1))
lr_rmse = np.sqrt(mean_squared_error(test_data['Weekly_Sales'].values, lr_preds))
lr_mape = mean_absolute_percentage_error(test_data['Weekly_Sales'].values, lr_preds)
print(f"RMSE for Linear Regression: {lr_rmse}", '\n', f"MAPE for Linear Regression: {lr_mape}")
# Add the model-group dummies as regressors.
model_variables = ['Weekly_Sales', 'Previous_Year_Sales', 'ab_np', 'ab_p']
test_data = test_data_og[model_variables].dropna()
train_data = train_data_og[model_variables].dropna()
lr = LinearRegression()
lr.fit(train_data.drop('Weekly_Sales', axis=1), train_data['Weekly_Sales'])
lr_preds = lr.predict(test_data.drop('Weekly_Sales', axis=1))
lr_rmse = np.sqrt(mean_squared_error(test_data['Weekly_Sales'].values, lr_preds))
lr_mape = mean_absolute_percentage_error(test_data['Weekly_Sales'].values, lr_preds)
print(f"RMSE for Linear Regression: {lr_rmse}", '\n', f"MAPE for Linear Regression: {lr_mape}")
# Log-promotions only (no previous-year sales).
model_variables = ['Weekly_Sales', 'Ln_Promotion3', 'Ln_Promotion5']
test_data = test_data_og[model_variables].dropna()
train_data = train_data_og[model_variables].dropna()
lr = LinearRegression()
lr.fit(train_data.drop('Weekly_Sales', axis=1), train_data['Weekly_Sales'])
lr_preds = lr.predict(test_data.drop('Weekly_Sales', axis=1))
lr_rmse = np.sqrt(mean_squared_error(test_data['Weekly_Sales'].values, lr_preds))
lr_mape = mean_absolute_percentage_error(test_data['Weekly_Sales'].values, lr_preds)
print(f"RMSE for Linear Regression: {lr_rmse}", '\n', f"MAPE for Linear Regression: {lr_mape}")
# random forest regression model on full dataset
# NOTE(review): fitted and evaluated on the same rows — this is an in-sample
# score, not a generalization estimate.
data_temp = data[['Weekly_Sales', 'Previous_Year_Sales']].dropna()
data_temp = data_temp.loc[data_temp.Weekly_Sales != 0]
m2=RandomForestRegressor().fit(data_temp[['Previous_Year_Sales']], data_temp['Weekly_Sales'])\
.predict(data_temp[['Previous_Year_Sales']])
# results from m2
rf=pd.Series({'R^2':metrics.r2_score(data_temp['Weekly_Sales'], m2),
'MSE':metrics.mean_squared_error(data_temp['Weekly_Sales'], m2),
'MAE':metrics.mean_absolute_error(data_temp['Weekly_Sales'], m2),
'RSME':np.sqrt(metrics.mean_squared_error(data_temp['Weekly_Sales'], m2)),
'MAPE':mean_absolute_percentage_error(data_temp['Weekly_Sales'], m2)
})
print(rf)
# Random forest on previous-year sales, evaluated on the held-out test window
# — once for the full data and once for each A/B model group.
def _rf_report(train_df, test_df):
    """Fit RF(Previous_Year_Sales -> Weekly_Sales) on train, score on test."""
    preds = RandomForestRegressor().fit(train_df[['Previous_Year_Sales']], train_df['Weekly_Sales'])\
        .predict(test_df[['Previous_Year_Sales']])
    return pd.Series({'R^2': metrics.r2_score(test_df['Weekly_Sales'], preds),
                      'MSE': metrics.mean_squared_error(test_df['Weekly_Sales'], preds),
                      'MAE': metrics.mean_absolute_error(test_df['Weekly_Sales'], preds),
                      'RSME': np.sqrt(metrics.mean_squared_error(test_df['Weekly_Sales'], preds)),
                      'MAPE': mean_absolute_percentage_error(test_df['Weekly_Sales'], preds)})

# random forest regression model with cross-validation (full data)
train_data = train_data_og[['Previous_Year_Sales', 'Weekly_Sales']].dropna()
test_data = test_data_og[['Previous_Year_Sales', 'Weekly_Sales']].dropna()
rf_cv = _rf_report(train_data, test_data)
print(rf_cv)
# ...restricted to the 'ab_np' model group
train_data = train_data_og.loc[train_data_og.model_group == 'ab_np'][['Previous_Year_Sales', 'Weekly_Sales']].dropna()
test_data = test_data_og.loc[test_data_og.model_group == 'ab_np'][['Previous_Year_Sales', 'Weekly_Sales']].dropna()
rf_cv = _rf_report(train_data, test_data)
print(rf_cv)
# ...restricted to the 'ab_p' model group
train_data = train_data_og.loc[train_data_og.model_group == 'ab_p'][['Previous_Year_Sales', 'Weekly_Sales']].dropna()
test_data = test_data_og.loc[test_data_og.model_group == 'ab_p'][['Previous_Year_Sales', 'Weekly_Sales']].dropna()
rf_cv = _rf_report(train_data, test_data)
print(rf_cv)
# can maybe do this by department?
train_data = train_data_og[['Previous_Year_Sales', 'Weekly_Sales']].dropna()
test_data = test_data_og[['Previous_Year_Sales', 'Weekly_Sales']].dropna()
# For more than 2 models, use a loop & dict for this
# Create the models
# MLP baseline on previous-year sales only.
mlp = MLPRegressor(max_iter=100)
mlp.fit(train_data[['Previous_Year_Sales']], train_data['Weekly_Sales'])
mlp_preds = mlp.predict(test_data[['Previous_Year_Sales']])
mlp_rmse = np.sqrt(mean_squared_error(test_data['Weekly_Sales'].values, mlp_preds))
# NOTE(review): `y` here is the module-level 'Weekly_Sales' left over from
# the OLS section above — works, but fragile.
print(f"RMSE for MLP: {mlp_rmse}" +', ' + f"MAPE: {mean_absolute_percentage_error(test_data[y].values, mlp_preds)}")
# by model_group
y = 'Weekly_Sales'
x = ['Previous_Year_Sales', 'ab_np', 'ab_p']
to_keep = cp.copy(x)
to_keep.append(y)
train_data = train_data_og[to_keep].dropna()
test_data = test_data_og[to_keep].dropna()
# For more than 2 models, use a loop & dict for this
# Create the models
mlp = MLPRegressor(max_iter=100)
mlp.fit(train_data[x], train_data[y])
mlp_preds = mlp.predict(test_data[x])
mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
print(f"RMSE for MLP: {mlp_rmse}" +', ' + f"MAPE: {mean_absolute_percentage_error(test_data[y].values, mlp_preds)}")
# splitting by departments (A & B Type only)
# One-at-a-time variable scan: RMSE/MAPE of an MLP on previous-year sales
# plus each candidate regressor, for the ab_np (no Christmas peak) group.
promo = ['Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5',
         'cpi_new', 'Fuel_Price', 'Unemployment', 'Temperature', 'IsHoliday', 'Imputed_Holiday', 'Christmas', 'Thanksgiving',
         'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
         'I0_Promotion1', 'I0_Promotion2', 'I0_Promotion3', 'I0_Promotion4', 'I0_Promotion5',
         'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5',
         'random_int'
         ]
y = 'Weekly_Sales'
var = []
rmse_value = []
mape_value = []
for i in range(len(promo) + 1):
    x = ['Previous_Year_Sales']
    if i == 0:
        text = 'Previous_Sales_Only'  # i == 0 is the no-extra-variable baseline
    else:
        x.append(promo[i - 1])
        text = promo[i - 1]
    to_keep = cp.copy(x)
    to_keep.append(y)
    train_data = train_data_og.loc[train_data_og.model_group == 'ab_np'][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == 'ab_np'][to_keep].dropna()
    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[x], train_data[y])
    mlp_preds = mlp.predict(test_data[x])
    rmse_value.append(np.sqrt(mean_squared_error(test_data[y].values, mlp_preds)))
    mape_value.append(mean_absolute_percentage_error(test_data[y].values, mlp_preds))
    var.append(text)
    # print('Done ' + str(i+1))
ab_np_rmse = pd.DataFrame({'Variable': var, 'RMSE': rmse_value, 'MAPE': mape_value})
ab_np_rmse['Data'] = 'ab_np'
# splitting by departments (A & B Type only)
# Same one-variable-at-a-time screen as above, for the 'ab_p' group.
promo = ['Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5',
'cpi_new', 'Fuel_Price', 'Unemployment', 'Temperature', 'IsHoliday', 'Imputed_Holiday', 'Christmas', 'Thanksgiving',
'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
'I0_Promotion1', 'I0_Promotion2', 'I0_Promotion3', 'I0_Promotion4', 'I0_Promotion5',
'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5',
'random_int'
]
y = 'Weekly_Sales'
var = []
rmse_value = []
mape_value = []
for i in range(len(promo) + 1):
    # Run 0 is the baseline; run i > 0 adds candidate promo[i-1].
    x = ['Previous_Year_Sales']
    if i > 0:
        x.append(promo[i - 1])
        text = promo[i - 1]
    else:
        text = 'Previous_Sales_Only'
    to_keep = x + [y]
    train_data = train_data_og.loc[train_data_og.model_group == 'ab_p'][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == 'ab_p'][to_keep].dropna()
    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[x], train_data[y])
    mlp_preds = mlp.predict(test_data[x])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
    rmse_value.append(mlp_rmse)
    mape_value.append(mlp_mape)
    var.append(text)
# Collect the per-variable scores for this group.
ab_p_rmse = pd.DataFrame({'Variable':var, 'RMSE':rmse_value, 'MAPE':mape_value})
ab_p_rmse['Data'] = 'ab_p'
# splitting by departments (C Type only)
# Same one-variable-at-a-time screen as above, for the 'c' group.
promo = ['Ln_Promotion1', 'Ln_Promotion2', 'Ln_Promotion3', 'Ln_Promotion4', 'Ln_Promotion5',
'cpi_new', 'Fuel_Price', 'Unemployment', 'Temperature', 'IsHoliday', 'Imputed_Holiday', 'Christmas', 'Thanksgiving',
'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5',
'I0_Promotion1', 'I0_Promotion2', 'I0_Promotion3', 'I0_Promotion4', 'I0_Promotion5',
'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5',
'random_int'
]
y = 'Weekly_Sales'
var = []
rmse_value = []
mape_value = []
for i in range(len(promo) + 1):
    # Run 0 is the baseline; run i > 0 adds candidate promo[i-1].
    x = ['Previous_Year_Sales']
    if i > 0:
        x.append(promo[i - 1])
        text = promo[i - 1]
    else:
        text = 'Previous_Sales_Only'
    to_keep = x + [y]
    train_data = train_data_og.loc[train_data_og.model_group == 'c'][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == 'c'][to_keep].dropna()
    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[x], train_data[y])
    mlp_preds = mlp.predict(test_data[x])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
    rmse_value.append(mlp_rmse)
    mape_value.append(mlp_mape)
    var.append(text)
# Collect the per-variable scores for this group.
c_rmse = pd.DataFrame({'Variable':var, 'RMSE':rmse_value, 'MAPE':mape_value})
c_rmse['Data'] = 'c'
# Stack the three per-group RMSE tables and, for each model group, pull out
# the best (minimum-RMSE) variable alongside the 'Previous_Sales_Only'
# baseline for comparison.
# FIX: DataFrame.append was removed in pandas 2.0 -- pd.concat is the
# supported equivalent (same default index behaviour).
rmse_data = pd.concat([ab_np_rmse, ab_p_rmse, c_rmse])
temp = rmse_data.loc[rmse_data.Data == 'ab_np']
temp_abnp = temp.loc[(temp.RMSE == temp['RMSE'].min()) | (temp.Variable == 'Previous_Sales_Only')]
temp = rmse_data.loc[rmse_data.Data == 'ab_p']
temp_abp = temp.loc[(temp.RMSE == temp['RMSE'].min()) | (temp.Variable == 'Previous_Sales_Only')]
temp = rmse_data.loc[rmse_data.Data == 'c']
temp_c = temp.loc[(temp.RMSE == temp['RMSE'].min()) | (temp.Variable == 'Previous_Sales_Only')]
# Bare expression: only renders in a notebook; kept for parity with the original.
pd.concat([temp_abnp, temp_abp, temp_c])
# Build the prediction (test) dataset:
#  - imputed promotion/holiday columns come from imputed_data.csv;
#  - model_group / Type are joined in from the training data;
#  - previous-year sales are derived by relabelling each year's sales
#    forward one year via the long->wide->long round trip below.
data_temp = pd.read_csv('imputed_data.csv')[['Store', 'Date', 'Imputed_Promotion1', 'Imputed_Promotion2', 'Imputed_Promotion3', 'Imputed_Promotion4', 'Imputed_Promotion5', 'Imputed_Holiday']]
data_temp['Date'] = pd.to_datetime(data_temp['Date'], format='%Y-%m-%d')
# Attach each department's model_group (via Type + Dept from the train set).
test_data = pd.read_csv('test_all.csv')\
.merge(data[['Dept', 'model_group', 'Type']].drop_duplicates(), on=['Type', 'Dept'], how='left')
# NOTE(review): test_all.csv dates are day-first, unlike the other files.
test_data['Date'] = pd.to_datetime(test_data['Date'], format='%d/%m/%Y')
test_data = test_data.merge(data_temp, on=['Date', 'Store'], how='left')\
.drop(['Year_2', 'Unnamed: 0'], axis=1).drop_duplicates()
# Wide table of Weekly_Sales per year; renaming {2011->2012, 2012->2013}
# shifts the labels forward so year Y's sales appear under Y+1, i.e. they
# become that row's "previous year" sales (2010 is dropped below).
py = long_to_wide(data[['Store', 'Dept', 'Year', 'Week', 'Weekly_Sales']], 'Year', 'Weekly_Sales').rename(columns = {2011:2012, 2012:2013})
py1 = wide_to_long(py.drop(2010, axis=1), 'Year', 'Previous_Year_Sales', [2012, 2013])
test_data = test_data.merge(py1, on=['Store', 'Dept', 'Year', 'Week'], how='left').rename(columns={'Weekly_Sales to be predicted':'Weekly_Sales'})
# Fit the selected C-group model (previous-year sales + Imputed_Promotion1),
# predict the test period, and plot mean train sales (blue) against mean
# predicted test sales (orange): overall, by store, and by department.
group = 'c'
x = ['Previous_Year_Sales', 'Imputed_Promotion1']
y = 'Weekly_Sales'
link = ['Date', 'Dept', 'Store']
link.extend(x)  # id columns + features: the columns needed from the test set
to_keep = link + [y]
train_temp = data.loc[data.model_group == group][to_keep].dropna()
test_temp = test_data.loc[test_data.model_group == group][link].dropna()
# BUG FIX: the original instantiated mlp_c but then called fit/predict on the
# stale global `mlp`; use the freshly created model instead.
mlp_c = MLPRegressor(max_iter=100)
mlp_c.fit(train_temp[x], train_temp[y])
mlp_preds = mlp_c.predict(test_temp[x])
test_temp['Weekly_Sales'] = mlp_preds
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20), sharex=True)
# overall mean sales over time
train_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#247ab5', legend=False)
test_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#ff8316', legend=False)
axs[0].set_title('Mean Predicted Sales')
axs[0].set_ylabel('Weekly Sales (US$)')
# one line per store
train_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1], c='#247ab5', legend=False)
test_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1],c='#ff8316', legend=False)
axs[1].set_title('Mean Predicted Sales by Store')
axs[1].set_ylabel('Weekly Sales (US$)')
# one line per department
train_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#247ab5', legend=False)
test_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#ff8316', legend=False)
axs[2].set_title('Mean Predicted Sales by Department')
axs[2].set_ylabel('Weekly Sales (US$)')
# NOTE(review): 'predcited' typo kept -- existing outputs reference this name.
plt.savefig('predcited_sales_' + group + '.png')
# Same fit-predict-plot pipeline for the 'ab_p' group.
group = 'ab_p'
x = ['Previous_Year_Sales', 'Imputed_Promotion1']
y = 'Weekly_Sales'
link = ['Date', 'Dept', 'Store']
link.extend(x)  # id columns + features: the columns needed from the test set
to_keep = link + [y]
train_temp = data.loc[data.model_group == group][to_keep].dropna()
test_temp = test_data.loc[test_data.model_group == group][link].dropna()
# BUG FIX: the original instantiated mlp_c but then called fit/predict on the
# stale global `mlp`; use the freshly created model instead.
mlp_c = MLPRegressor(max_iter=100)
mlp_c.fit(train_temp[x], train_temp[y])
mlp_preds = mlp_c.predict(test_temp[x])
test_temp['Weekly_Sales'] = mlp_preds
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20), sharex=True)
# overall mean sales over time
train_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#247ab5', legend=False)
test_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#ff8316', legend=False)
axs[0].set_title('Mean Predicted Sales')
axs[0].set_ylabel('Weekly Sales (US$)')
# one line per store
train_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1], c='#247ab5', legend=False)
test_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1],c='#ff8316', legend=False)
axs[1].set_title('Mean Predicted Sales by Store')
axs[1].set_ylabel('Weekly Sales (US$)')
# one line per department
train_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#247ab5', legend=False)
test_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#ff8316', legend=False)
axs[2].set_title('Mean Predicted Sales by Department')
axs[2].set_ylabel('Weekly Sales (US$)')
# NOTE(review): 'predcited' typo kept -- existing outputs reference this name.
plt.savefig('predcited_sales_' + group + '.png')
# Same fit-predict-plot pipeline for the 'ab_np' group.
group = 'ab_np'
x = ['Previous_Year_Sales', 'Imputed_Promotion1']
y = 'Weekly_Sales'
link = ['Date', 'Dept', 'Store']
link.extend(x)  # id columns + features: the columns needed from the test set
to_keep = link + [y]
train_temp = data.loc[data.model_group == group][to_keep].dropna()
test_temp = test_data.loc[test_data.model_group == group][link].dropna()
# BUG FIX: the original instantiated mlp_c but then called fit/predict on the
# stale global `mlp`; use the freshly created model instead.
mlp_c = MLPRegressor(max_iter=100)
mlp_c.fit(train_temp[x], train_temp[y])
mlp_preds = mlp_c.predict(test_temp[x])
test_temp['Weekly_Sales'] = mlp_preds
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20), sharex=True)
# overall mean sales over time
train_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#247ab5', legend=False)
test_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#ff8316', legend=False)
axs[0].set_title('Mean Predicted Sales')
axs[0].set_ylabel('Weekly Sales (US$)')
# one line per store
train_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1], c='#247ab5', legend=False)
test_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1],c='#ff8316', legend=False)
axs[1].set_title('Mean Predicted Sales by Store')
axs[1].set_ylabel('Weekly Sales (US$)')
# one line per department
train_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#247ab5', legend=False)
test_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#ff8316', legend=False)
axs[2].set_title('Mean Predicted Sales by Department')
axs[2].set_ylabel('Weekly Sales (US$)')
# NOTE(review): 'predcited' typo kept -- existing outputs reference this name.
plt.savefig('predcited_sales_' + group + '.png')
# Pooled model across all groups WITH group-membership dummies as features.
group = 'all_w_dummies'
x = ['Previous_Year_Sales', 'Imputed_Promotion1']
y = 'Weekly_Sales'
link = ['Date', 'Dept', 'Store', 'Type']
dummies = ['ab_np', 'ab_p']
for i, a in enumerate(x):  # keeps `i`/`a` bound exactly as the original did
    link.append(a)
to_keep = cp.copy(link)
to_keep.append(y)
# Per-store dummy values, joined onto both train and test rows.
dummies_data = data[['ab_np', 'ab_p', 'Store', 'Type']].drop_duplicates()
train_temp = data[to_keep].dropna().merge(dummies_data, on=['Store', 'Type'], how='left')
test_temp = test_data[link].dropna().merge(dummies_data, on=['Store', 'Type'], how='left')
to_keep += dummies  # kept for parity with the original; not read again below
x += dummies  # the dummies join the feature set for the fit
# BUG FIX: the original instantiated mlp_c but then called fit/predict on the
# stale global `mlp`; use the freshly created model instead.
mlp_c = MLPRegressor(max_iter=100)
mlp_c.fit(train_temp[x], train_temp[y])
mlp_preds = mlp_c.predict(test_temp[x])
test_temp['Weekly_Sales'] = mlp_preds
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20), sharex=True)
# overall mean sales over time
train_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#247ab5', legend=False)
test_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#ff8316', legend=False)
axs[0].set_title('Mean Predicted Sales')
axs[0].set_ylabel('Weekly Sales (US$)')
# one line per store
train_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1], c='#247ab5', legend=False)
test_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1],c='#ff8316', legend=False)
axs[1].set_title('Mean Predicted Sales by Store')
axs[1].set_ylabel('Weekly Sales (US$)')
# one line per department
train_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#247ab5', legend=False)
test_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#ff8316', legend=False)
axs[2].set_title('Mean Predicted Sales by Department')
axs[2].set_ylabel('Weekly Sales (US$)')
# NOTE(review): 'predcited' typo kept -- existing outputs reference this name.
plt.savefig('predcited_sales_' + group + '.png')
# Pooled model across all groups WITHOUT dummies (previous-year sales only).
group = 'all_wo_dummies'
x = ['Previous_Year_Sales']
y = 'Weekly_Sales'
link = ['Date', 'Dept', 'Store', 'Type']
for i, a in enumerate(x):  # keeps `i`/`a` bound exactly as the original did
    link.append(a)
to_keep = cp.copy(link)
to_keep.append(y)
train_temp = data[to_keep].dropna()
test_temp = test_data[link].dropna()
# BUG FIX: the original instantiated mlp_c but then called fit/predict on the
# stale global `mlp`; use the freshly created model instead.
mlp_c = MLPRegressor(max_iter=100)
mlp_c.fit(train_temp[x], train_temp[y])
mlp_preds = mlp_c.predict(test_temp[x])
test_temp['Weekly_Sales'] = mlp_preds
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20), sharex=True)
# overall mean sales over time
train_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#247ab5', legend=False)
test_temp[['Date', 'Weekly_Sales']].groupby(['Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[0], c='#ff8316', legend=False)
axs[0].set_title('Mean Predicted Sales')
axs[0].set_ylabel('Weekly Sales (US$)')
# one line per store
train_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1], c='#247ab5', legend=False)
test_temp[['Store', 'Date', 'Weekly_Sales']].groupby(['Store', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[1],c='#ff8316', legend=False)
axs[1].set_title('Mean Predicted Sales by Store')
axs[1].set_ylabel('Weekly Sales (US$)')
# one line per department
train_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#247ab5', legend=False)
test_temp[['Dept', 'Date', 'Weekly_Sales']].groupby(['Dept', 'Date']).mean().unstack().T.reset_index().drop('level_0', axis=1).set_index('Date').plot(ax=axs[2], c='#ff8316', legend=False)
axs[2].set_title('Mean Predicted Sales by Department')
axs[2].set_ylabel('Weekly Sales (US$)')
# NOTE(review): 'predcited' typo kept -- existing outputs reference this name.
plt.savefig('predcited_sales_' + group + '.png')
# previous sales only, pooled across all model groups
# Baseline MLP fitted on the pooled data, with an actual-vs-predicted
# mean-sales plot on a single axis.
y = 'Weekly_Sales'
x = ['Previous_Year_Sales']
var = []
rmse_value = []
mape_value = []
mlp_list = []  # kept for parity with the grouped version below; never filled
to_keep = ['Weekly_Sales', 'Previous_Year_Sales', 'Date', 'Store']
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(20,10))
#dfs
train_data = train_data_og[to_keep].dropna()
test_data = test_data_og[to_keep].dropna()
#models
mlp = MLPRegressor(max_iter=100)
mlp.fit(train_data[x], train_data[y])
mlp_preds = mlp.predict(test_data[x])
mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
#extract stats
# BUG FIX: the original read `group[i]`, indexing into the leftover string
# 'all_wo_dummies' with a stale loop index, so the label came out as 'a'.
text = 'all_model_groups'
rmse_value.append(mlp_rmse)
mape_value.append(mlp_mape)
var.append(text)
#figure
data_temp = cp.copy(test_data)
data_temp['Predicted Sales'] = mlp_preds
data_temp[['Weekly_Sales', 'Date']].rename(columns={'Weekly_Sales':'Actual Sales'}).groupby('Date').mean().plot(ax=axs)
data_temp[['Predicted Sales', 'Date']].groupby('Date').mean().plot(ax=axs)
axs.set_title(text)
axs.set_ylabel('Mean Weekly Sales (US$)')
ps_only = pd.DataFrame({'Variable':var, 'RMSE':rmse_value, 'MAPE':mape_value})
ps_only.head()  # bare expression: only renders in a notebook
# previous sales only by model group
# One baseline MLP (previous-year sales only) per model group, with a
# side-by-side actual-vs-predicted mean-sales plot for each group.
group = ['ab_np', 'ab_p', 'c']
y = 'Weekly_Sales'
x = ['Previous_Year_Sales']
var = []
rmse_value = []
mape_value = []
mlp_list = []
to_keep = ['Weekly_Sales', 'Previous_Year_Sales', 'Date', 'Store']
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20,10))
for i, grp in enumerate(group):
    # subset this group's rows (dropna removes rows lacking prior-year sales)
    train_data = train_data_og.loc[train_data_og.model_group == grp][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == grp][to_keep].dropna()
    # fit and score
    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[x], train_data[y])
    mlp_preds = mlp.predict(test_data[x])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
    # record the scores under the group's name
    text = grp
    rmse_value.append(mlp_rmse)
    mape_value.append(mlp_mape)
    var.append(text)
    # actual vs predicted mean weekly sales on this group's axis
    data_temp = cp.copy(test_data)
    data_temp['Predicted Sales'] = mlp_preds
    data_temp[['Weekly_Sales', 'Date']].rename(columns={'Weekly_Sales':'Actual Sales'}).groupby('Date').mean().plot(ax=axs[i])
    data_temp[['Predicted Sales', 'Date']].groupby('Date').mean().plot(ax=axs[i])
    axs[i].set_title(text)
    axs[i].set_ylabel('Mean Weekly Sales (US$)')
ps_only = pd.DataFrame({'Variable':var, 'RMSE':rmse_value, 'MAPE':mape_value})
ps_only.head()  # bare expression: only renders in a notebook
# combining rmse values into one file
# Pivot the per-variable RMSE table wide (one column per model group), then
# transpose so each candidate variable is a column and each row is a group.
data_temp = long_to_wide(rmse_data.drop('MAPE', axis=1), 'Data', 'RMSE').set_index('Variable').T
cols = data_temp.columns.tolist()
# Express every variable's RMSE as a difference from the baseline run.
# NOTE(review): cols[1:] assumes cols[0] is 'Previous_Sales_Only' -- TODO
# confirm the column order coming out of long_to_wide.
for col in cols[1:]:
    #data_temp.loc[data_temp[col] > data_temp['Previous_Sales_Only'], col] = np.nan
    data_temp[col] = data_temp[col] - data_temp['Previous_Sales_Only']
data_temp['Previous_Sales_Only'] = 0.0
# Transpose back and join the diff_* columns onto the raw RMSE columns.
data_temp = data_temp.T.rename(columns={'ab_np':'diff_ab_np', 'ab_p':'diff_ab_p', 'c':'diff_c'})
rmse_data = long_to_wide(rmse_data.drop('MAPE', axis=1), 'Data', 'RMSE').set_index('Variable').join(data_temp)
rmse_data.to_csv('nn_rmse_results2.csv')
# Bare expression: only renders in a notebook.
rmse_data.sort_values('diff_ab_p')
# testing variable combinations --> ab_np
# Fit one MLP per unordered pair of the short-listed variables, on top of
# previous-year sales, and report RMSE/MAPE for each pair.
promo_single = ['Imputed_Promotion5', 'Imputed_Promotion2']
promo = []
for i in range(len(promo_single)):
    for j in range(i + 1, len(promo_single)):
        promo.append([promo_single[i], promo_single[j]])
y = 'Weekly_Sales'
x = ['Previous_Year_Sales']
for i in range(len(promo)):
    # BUG FIX: the original fitted on `x` alone, so the variable pair only
    # affected the dropna() row filter and never entered the feature set.
    feats = x + promo[i]
    to_keep = feats + [y]
    text = promo[i][0] + ' and ' + promo[i][1]
    train_data = train_data_og.loc[train_data_og.model_group == 'ab_np'][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == 'ab_np'][to_keep].dropna()
    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[feats], train_data[y])
    mlp_preds = mlp.predict(test_data[feats])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
    print(text, '\t', f"RMSE: {mlp_rmse}", '\t', f"MAPE: {mlp_mape}")
# testing variable combinations --> ab_p
# Same pairwise-combination test for the 'ab_p' group.
promo_single = ['Promotion5', 'Promotion1', 'Unemployment', 'Imputed_Holiday']
promo = []
for i in range(len(promo_single)):
    for j in range(i + 1, len(promo_single)):
        promo.append([promo_single[i], promo_single[j]])
y = 'Weekly_Sales'
x = ['Previous_Year_Sales']
for i in range(len(promo)):
    # BUG FIX: the original fitted on `x` alone, so the variable pair only
    # affected the dropna() row filter and never entered the feature set.
    feats = x + promo[i]
    to_keep = feats + [y]
    text = promo[i][0] + ' and ' + promo[i][1]
    train_data = train_data_og.loc[train_data_og.model_group == 'ab_p'][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == 'ab_p'][to_keep].dropna()
    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[feats], train_data[y])
    mlp_preds = mlp.predict(test_data[feats])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    mlp_mape = mean_absolute_percentage_error(test_data[y].values, mlp_preds)
    print(text, '\t', f"RMSE: {mlp_rmse}", '\t', f"MAPE: {mlp_mape}")
# testing variable combinations --> c
# Same pairwise-combination test for the 'c' group (RMSE only, as original).
promo_single = ['Promotion3', 'Promotion4']
promo = []
for i in range(len(promo_single)):
    for j in range(i + 1, len(promo_single)):
        promo.append([promo_single[i], promo_single[j]])
y = 'Weekly_Sales'
x = ['Previous_Year_Sales']
for i in range(len(promo)):
    # BUG FIX: the original fitted on `x` alone, so the variable pair only
    # affected the dropna() row filter and never entered the feature set.
    feats = x + promo[i]
    to_keep = feats + [y]
    text = promo[i][0] + ' and ' + promo[i][1]
    train_data = train_data_og.loc[train_data_og.model_group == 'c'][to_keep].dropna()
    test_data = test_data_og.loc[test_data_og.model_group == 'c'][to_keep].dropna()
    mlp = MLPRegressor(max_iter=100)
    mlp.fit(train_data[feats], train_data[y])
    mlp_preds = mlp.predict(test_data[feats])
    mlp_rmse = np.sqrt(mean_squared_error(test_data[y].values, mlp_preds))
    print(text, f"RMSE for MLP: {mlp_rmse}")